# Import packages
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_datareader as pdr
import statsmodels.api as sm
L16: Linear regression applications
Preliminaries
# Load Fama-French factor data
ff3f = pdr.DataReader('F-F_Research_Data_Factors', 'famafrench', '2012-01-01')[0]/100
ff3f.head(2)
| Date | Mkt-RF | SMB | HML | RF |
|---|---|---|---|---|
| 2012-01 | 0.0505 | 0.0206 | -0.0094 | 0.0 |
| 2012-02 | 0.0442 | -0.0186 | 0.0043 | 0.0 |
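A note on the download above: the `famafrench` reader returns a dictionary of DataFrames, where key `0` holds the monthly factors, and the raw values are in percent (hence the division by 100). If you want to verify this yourself, the dictionary also includes a `'DESCR'` text entry describing the dataset. A minimal sketch, reusing the same dataset name:

# Inspect the raw famafrench download (a dict of DataFrames plus a 'DESCR' text entry)
ff_raw = pdr.DataReader('F-F_Research_Data_Factors', 'famafrench', '2012-01-01')
print(ff_raw.keys())
print(ff_raw['DESCR'])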
Load data on TSLA and clean it:
# Download monthly prices (keep only Adjusted Close prices, converted from Series to DataFrame)
firm_prices = yf.download('TSLA', '2012-12-01', '2020-12-31', interval = '1mo')['Adj Close'].dropna().to_frame()

# Calculate monthly returns and drop missing values
firm_ret = firm_prices.pct_change().dropna()

# Rename "Adj Close" to "TSLA"
firm_ret.rename(columns = {'Adj Close': 'TSLA'}, inplace = True)

# Convert index to monthly period date
firm_ret.index = firm_ret.index.to_period('M')
firm_ret.head(2)
| Date | TSLA |
|---|---|
| 2013-01 | 0.107470 |
| 2013-02 | -0.071448 |
# Merge the two datasets (join aligns on the monthly PeriodIndex shared by both)
data = firm_ret.join(ff3f)
# Add a column of ones for the regression intercept
data['const'] = 1
data.head(2)
| Date | TSLA | Mkt-RF | SMB | HML | RF | const |
|---|---|---|---|---|---|---|
| 2013-01 | 0.107470 | 0.0557 | 0.0031 | 0.0095 | 0.0 | 1 |
| 2013-02 | -0.071448 | 0.0129 | -0.0033 | 0.0010 | 0.0 | 1 |
# Set up the data
# Dependent variable (left side of the equal sign)
y = data['TSLA'] - data['RF']
y.head(2)
Date
2013-01 0.107470
2013-02 -0.071448
Freq: M, dtype: float64
# Independent variable(s) (right side of the equal sign)
X = data[['const','Mkt-RF']]
X.head(2)
| Date | const | Mkt-RF |
|---|---|---|
| 2013-01 | 1 | 0.0557 |
| 2013-02 | 1 | 0.0129 |
# Run regression and store results in "res" object
res = sm.OLS(y,X).fit()
print(res.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.155
Model: OLS Adj. R-squared: 0.146
Method: Least Squares F-statistic: 17.26
Date: Mon, 28 Mar 2022 Prob (F-statistic): 7.19e-05
Time: 08:49:43 Log-Likelihood: 30.143
No. Observations: 96 AIC: -56.29
Df Residuals: 94 BIC: -51.16
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0416 0.019 2.185 0.031 0.004 0.079
Mkt-RF 1.8300 0.441 4.154 0.000 0.955 2.705
==============================================================================
Omnibus: 29.750 Durbin-Watson: 1.561
Prob(Omnibus): 0.000 Jarque-Bera (JB): 56.488
Skew: 1.227 Prob(JB): 5.42e-13
Kurtosis: 5.846 Cond. No. 24.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
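Note [1] is a reminder that the standard errors above are "nonrobust". If heteroskedasticity or autocorrelation in the residuals is a concern, statsmodels can re-estimate the covariance matrix through the `cov_type` argument of `.fit()`. A minimal sketch using HAC (Newey-West) errors; the `maxlags` value here is an arbitrary illustration, not a recommendation:

# Same regression with HAC (Newey-West) standard errors
res_robust = sm.OLS(y, X).fit(cov_type='HAC', cov_kwds={'maxlags': 6})
print(res_robust.summary())

The coefficients are identical to the nonrobust fit; only the standard errors (and hence t-statistics and p-values) change.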
Challenge:
Estimate the Fama-French three-factor model using the data gathered above.
# Set up X variables
X3 = data[['const','Mkt-RF','SMB','HML']]
# Run regression
res3 = sm.OLS(y,X3).fit()
print(res3.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.164
Model: OLS Adj. R-squared: 0.137
Method: Least Squares F-statistic: 6.021
Date: Mon, 28 Mar 2022 Prob (F-statistic): 0.000862
Time: 08:49:43 Log-Likelihood: 30.657
No. Observations: 96 AIC: -53.31
Df Residuals: 92 BIC: -43.06
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0371 0.020 1.873 0.064 -0.002 0.076
Mkt-RF 1.8838 0.481 3.919 0.000 0.929 2.838
SMB 0.1771 0.794 0.223 0.824 -1.400 1.755
HML -0.6607 0.667 -0.991 0.324 -1.985 0.664
==============================================================================
Omnibus: 31.331 Durbin-Watson: 1.586
Prob(Omnibus): 0.000 Jarque-Bera (JB): 62.428
Skew: 1.268 Prob(JB): 2.78e-14
Kurtosis: 6.029 Cond. No. 45.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Common uses for regression results
Assume that we ran a regression of the form:
\[y_t = \alpha + \beta \cdot x_t + \epsilon_t \]
In the CAPM regression we ran above, \(y_t\) is the excess return on TSLA and \(x_t\) is the excess return on the market:
\[R_{i,t} - R_{f,t} = \alpha_i + \beta_i (R_{m,t} - R_{f,t}) + \epsilon_{i,t}\]
Conditional predictions
We can use the results of our regression to estimate what we should expect the value of the dependent variable to be, if we knew the value of the independent variable(s). Mathematically, this is given by:
\[ E[y_t | x_t] = \alpha + \beta \cdot x_t \]
Example:
Using the results from the single-factor regression above, what is the expected excess return of TSLA if the market excess return is 2%?
# Extract coefficients from the results object
alpha = res.params['const']
beta = res.params['Mkt-RF']
print("alpha=",alpha,'\n','beta=',beta)
alpha= 0.041630987538345834
beta= 1.829989418169259
# Conditional prediction
tsla_cond_prediction = alpha + beta * 0.02
print("Expected excess return of TSLA if market excess return is 2%: ", tsla_cond_prediction)
Expected excess return of TSLA if market excess return is 2%: 0.07823077590173103
Challenge:
Using the results from the three-factor regression above, what is the expected excess return of TSLA if the market excess return is 2%, the SMB return is -1%, and the HML return is 0.5%?
# Extract params
alpha3 = res3.params['const']
beta_mkt = res3.params['Mkt-RF']
beta_smb = res3.params['SMB']
beta_hml = res3.params['HML']
# Conditional prediction
tsla_cond_pred3 = alpha3 + beta_mkt * 0.02 + beta_smb * (-0.01) + beta_hml * 0.005
tsla_cond_pred3
0.06971839299927778
# Same prediction as a dot product of the X values and the estimated coefficients
[1, 0.02, -0.01, 0.005] @ res3.params
0.06971839299927778
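Equivalently, the results object has a `predict` method that takes a row of X values in the same order as the regressors; this is a third way of obtaining the same number:

# Same prediction using the predict method of the results object
res3.predict([[1, 0.02, -0.01, 0.005]])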
Unconditional predictions
We can use the results of our regression to estimate what we should expect the value of the dependent variable to be, using our best guess for the value of the independent variable(s). Mathematically, this is given by:
\[ E[y_t] = \alpha + \beta \cdot E[x_t] \]
Example:
Using the results from the regression above, what is the expected excess return of TSLA (i.e. the risk premium on TSLA)? To answer this question, we must first estimate \(E[R_m - R_f]\) (i.e. the market risk premium). We do so by taking an average of the excess returns on the market over a very long time (below we use the last 90 years).
# Download 90 years of data on market excess returns
ff3f_long = pdr.DataReader('F-F_Research_Data_Factors', 'famafrench', '1930-01-01','2020-12-31')[0]/100
ff3f_long.head(2)
| Date | Mkt-RF | SMB | HML | RF |
|---|---|---|---|---|
| 1930-01 | 0.0561 | 0.0353 | -0.0092 | 0.0014 |
| 1930-02 | 0.0250 | 0.0019 | 0.0020 | 0.0030 |
# Estimate (monthly) market risk premium
mkt_risk_premium = ff3f_long['Mkt-RF'].mean()
mkt_risk_premium
0.0066291208791208825
# Estimate TSLA risk premium
tsla_uncond_prediction = alpha + beta * mkt_risk_premium
print("TSLA risk premium = ", tsla_uncond_prediction)
TSLA risk premium = 0.05376220859890195
Challenge:
Estimate the risk premium of TSLA using the three-factor model, with factor risk premia estimated using the last 90 years of data.
# Estimate risk premia on all the factors
premia = ff3f_long.mean()
premia
Mkt-RF 0.006629
SMB 0.002504
HML 0.003253
RF 0.002693
dtype: float64
# Estimate TSLA risk premium
tsla_premium_3f = alpha3 + beta_mkt * premia['Mkt-RF'] + beta_smb * premia['SMB'] + beta_hml * premia['HML']
tsla_premium_3f
0.047898710808046654
Variance decomposition
The regression results allow us to decompose the total variance of the dependent variable into the portion that can be explained by the variance of the explanatory variables and the portion that cannot be explained by these variables. Mathematically, the regression equation implies:
\[ Var[Y] = \beta^2 \cdot Var[X] + Var[\epsilon] \]
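To see where this comes from, expand the variance of the right side of the regression equation:

\[ Var[Y] = Var[\alpha + \beta \cdot X + \epsilon] = \beta^2 \cdot Var[X] + Var[\epsilon] + 2 \beta \cdot Cov[X, \epsilon] \]

The constant \(\alpha\) adds no variance, and OLS guarantees that (in sample) the residuals are uncorrelated with the regressors, so the covariance term drops out.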
Example:
Using the results from the regression above, calculate the total variance of TSLA, as well as its systematic variance and its idiosyncratic variance.
# Total risk of TSLA (variance)
tot_risk = y.var()
tot_risk
0.037372232872271184
# Systematic risk
sys_risk = (beta**2) * data['Mkt-RF'].var()
sys_risk
0.005796835794247544
# Idiosyncratic risk
idio_risk = tot_risk - sys_risk
idio_risk
0.03157539707802364
# Another way of calculating idiosyncratic risk: the variance of the residuals (epsilons) from the regression
# (the two match because, in sample, OLS residuals are uncorrelated with the regressors)
idio_risk2 = res.resid.var() # res.resid gives us the residuals from the regression
idio_risk2
0.03157539707802364
# Print all three of them out
print(f'TSLA risk: \n total = {tot_risk: .4f} \n systematic = {sys_risk: .4f} \n idiosyncratic = {idio_risk: .4f}')
TSLA risk:
total = 0.0374
systematic = 0.0058
idiosyncratic = 0.0316
# Print as percentages of total risk
pct_sys_risk = sys_risk / tot_risk
pct_idio_risk = idio_risk / tot_risk
print(f'\n percent systematic risk = {pct_sys_risk: .4f} \n percent idiosyncratic risk = {pct_idio_risk: .4f}')
percent systematic risk = 0.1551
percent idiosyncratic risk = 0.8449
Challenge:
Using the Fama-French three-factor model, what percentage of TSLA's total risk is diversifiable and what percentage is undiversifiable?
print("TSLA pct diversifiable / idio risk = ", 1 - res3.rsquared)
print("TSLA pct non-diversifiable / sys risk = ", res3.rsquared)
TSLA pct diversifiable / idio risk = 0.8358803520627377
TSLA pct non-diversifiable / sys risk = 0.1641196479372623
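Using \(R^2\) this way is equivalent to repeating the variance decomposition we did for the single-factor model: with an intercept in the model, \(R^2 = 1 - Var[\epsilon] / Var[Y]\). A quick check using the three-factor residuals:

# Diversifiable (idiosyncratic) share of variance computed directly from the residuals
print(res3.resid.var() / y.var()) # should match 1 - res3.rsquared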
Interaction effects between explanatory variables
In some circumstances, we might want our linear regression model to allow the relation between two variables to depend on a third variable:
\[ Y_t = \alpha + (\beta + \gamma \cdot Z_t) \cdot X_t + \delta \cdot Z_t + \epsilon_t \]
Note that the effect of X on Y (i.e. \(\beta + \gamma \cdot Z_t\)) depends on the value of a third variable (\(Z_t\)).
The regression above is often written (equivalently) as:
\[ Y_t = \alpha + \beta \cdot X_t + \gamma \cdot Z_t \cdot X_t + \delta \cdot Z_t + \epsilon_t \]
where the \(Z_t \cdot X_t\) term is called the interaction between the X and Z variables. This interaction term needs to be constructed in the data before we run our regression (by taking the product of X and Z).
Dummy variables (or “indicator” variables) are variables that take only the values 0 or 1. They are often used in interaction terms (as the \(Z\) variable above) to test if the relation between the main variables of interest (Y and X) is significantly different when some particular condition is satisfied (i.e. Z will equal 1 when the condition is satisfied and 0 when it is not).
Example:
Using the same data as in the regressions above, test if: 1. TSLA’s alpha is significantly different before 2015 than after 2015. 2. TSLA’s beta is significantly different before 2015 than after 2015.
In this example, the \(Z_t\) variable will have a value of 0 before 2015 and a value of 1 after 2015. So, before 2015, the equation above becomes
\[ Y_t = \alpha + \beta \cdot X_t + \epsilon_t \]
and after 2015, it becomes
\[ Y_t = \alpha + \delta + (\beta + \gamma) \cdot X_t + \epsilon_t \]
Hence, \(\delta\) tells us the difference between the firm’s alpha after 2015 and its alpha before 2015. And \(\gamma\) tells us the difference between the firm’s beta after 2015 and its beta before 2015.
# Create dummy variable that = 1 after 2015 and 0 before
data['post2015'] = np.where(data.index.year > 2015, 1, 0)
data
| Date | TSLA | Mkt-RF | SMB | HML | RF | const | post2015 |
|---|---|---|---|---|---|---|---|
| 2013-01 | 0.107470 | 0.0557 | 0.0031 | 0.0095 | 0.0000 | 1 | 0 |
| 2013-02 | -0.071448 | 0.0129 | -0.0033 | 0.0010 | 0.0000 | 1 | 0 |
| 2013-03 | 0.087855 | 0.0403 | 0.0083 | -0.0023 | 0.0000 | 1 | 0 |
| 2013-04 | 0.424914 | 0.0155 | -0.0236 | 0.0050 | 0.0000 | 1 | 0 |
| 2013-05 | 0.810706 | 0.0280 | 0.0172 | 0.0267 | 0.0000 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2020-08 | 0.741452 | 0.0763 | -0.0022 | -0.0293 | 0.0001 | 1 | 1 |
| 2020-09 | -0.139087 | -0.0363 | -0.0004 | -0.0266 | 0.0001 | 1 | 1 |
| 2020-10 | -0.095499 | -0.0210 | 0.0439 | 0.0419 | 0.0001 | 1 | 1 |
| 2020-11 | 0.462736 | 0.1247 | 0.0574 | 0.0199 | 0.0001 | 1 | 1 |
| 2020-12 | 0.243252 | 0.0463 | 0.0483 | -0.0156 | 0.0001 | 1 | 1 |

96 rows × 7 columns
# Create interaction term
data['mkt_x_post2015'] = data['Mkt-RF'] * data['post2015']
data
| Date | TSLA | Mkt-RF | SMB | HML | RF | const | post2015 | mkt_x_post2015 |
|---|---|---|---|---|---|---|---|---|
| 2013-01 | 0.107470 | 0.0557 | 0.0031 | 0.0095 | 0.0000 | 1 | 0 | 0.0000 |
| 2013-02 | -0.071448 | 0.0129 | -0.0033 | 0.0010 | 0.0000 | 1 | 0 | 0.0000 |
| 2013-03 | 0.087855 | 0.0403 | 0.0083 | -0.0023 | 0.0000 | 1 | 0 | 0.0000 |
| 2013-04 | 0.424914 | 0.0155 | -0.0236 | 0.0050 | 0.0000 | 1 | 0 | 0.0000 |
| 2013-05 | 0.810706 | 0.0280 | 0.0172 | 0.0267 | 0.0000 | 1 | 0 | 0.0000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2020-08 | 0.741452 | 0.0763 | -0.0022 | -0.0293 | 0.0001 | 1 | 1 | 0.0763 |
| 2020-09 | -0.139087 | -0.0363 | -0.0004 | -0.0266 | 0.0001 | 1 | 1 | -0.0363 |
| 2020-10 | -0.095499 | -0.0210 | 0.0439 | 0.0419 | 0.0001 | 1 | 1 | -0.0210 |
| 2020-11 | 0.462736 | 0.1247 | 0.0574 | 0.0199 | 0.0001 | 1 | 1 | 0.1247 |
| 2020-12 | 0.243252 | 0.0463 | 0.0483 | -0.0156 | 0.0001 | 1 | 1 | 0.0463 |

96 rows × 8 columns
# Get new independent variables
newX = data[['const', 'post2015', 'Mkt-RF', 'mkt_x_post2015']]
# Run regression and print results
res_interact = sm.OLS(y, newX).fit()
print(res_interact.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.167
Model: OLS Adj. R-squared: 0.140
Method: Least Squares F-statistic: 6.147
Date: Mon, 28 Mar 2022 Prob (F-statistic): 0.000742
Time: 08:49:43 Log-Likelihood: 30.822
No. Observations: 96 AIC: -53.64
Df Residuals: 92 BIC: -43.39
Df Model: 3
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
const 0.0605 0.032 1.887 0.062 -0.003 0.124
post2015 -0.0266 0.040 -0.665 0.507 -0.106 0.053
Mkt-RF 0.8907 0.963 0.925 0.358 -1.022 2.804
mkt_x_post2015 1.1919 1.084 1.099 0.274 -0.961 3.345
==============================================================================
Omnibus: 32.215 Durbin-Watson: 1.569
Prob(Omnibus): 0.000 Jarque-Bera (JB): 63.959
Skew: 1.309 Prob(JB): 1.29e-14
Kurtosis: 6.022 Cond. No. 93.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Is the beta of the firm significantly different (at the 5% level) after 2015?
print("Is beta sig different after 2015?\n", res_interact.pvalues['mkt_x_post2015'] < 0.05)
Is beta sig different after 2015?
False
# Is the alpha of the firm significantly different (at the 5% level) after 2015?
print("Is alpha sig different after 2015?\n", res_interact.pvalues['post2015'] < 0.05)
Is alpha sig different after 2015?
False
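The two tests above look at each coefficient separately. To test whether alpha and beta changed jointly after 2015, we can test both restrictions at once with an F-test; a minimal sketch using the `f_test` method of the results object (restrictions written as a comma-separated string of hypotheses):

# Joint test: did alpha and beta both stay the same after 2015?
print(res_interact.f_test('post2015 = 0, mkt_x_post2015 = 0'))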
Non-linear explanatory variables
In some circumstances we might want to test if there is a significant non-linear relationship between two variables of interest. For example, to test for a quadratic relation between Y and X, we can run the following regression:
\[ Y_t = \alpha + \beta \cdot X_t + \gamma \cdot X_t^2 + \epsilon_t\]
The \(X^2\) variable needs to be created in the data before we run the regression.
Example:
Using the market model above, test if there is a significant quadratic relation between TSLA excess returns and market excess returns.
# Create quadratic market excess returns
data['mkt_squared'] = data['Mkt-RF']**2
data.head()
| Date | TSLA | Mkt-RF | SMB | HML | RF | const | post2015 | mkt_x_post2015 | mkt_squared |
|---|---|---|---|---|---|---|---|---|---|
| 2013-01 | 0.107470 | 0.0557 | 0.0031 | 0.0095 | 0.0 | 1 | 0 | 0.0 | 0.003102 |
| 2013-02 | -0.071448 | 0.0129 | -0.0033 | 0.0010 | 0.0 | 1 | 0 | 0.0 | 0.000166 |
| 2013-03 | 0.087855 | 0.0403 | 0.0083 | -0.0023 | 0.0 | 1 | 0 | 0.0 | 0.001624 |
| 2013-04 | 0.424914 | 0.0155 | -0.0236 | 0.0050 | 0.0 | 1 | 0 | 0.0 | 0.000240 |
| 2013-05 | 0.810706 | 0.0280 | 0.0172 | 0.0267 | 0.0 | 1 | 0 | 0.0 | 0.000784 |
# Get new independent variables
sqX = data[['const', 'Mkt-RF', 'mkt_squared']]
# Run regression and print results
res_squared = sm.OLS(y, sqX).fit()
print(res_squared.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.171
Model: OLS Adj. R-squared: 0.153
Method: Least Squares F-statistic: 9.586
Date: Mon, 28 Mar 2022 Prob (F-statistic): 0.000164
Time: 08:49:43 Log-Likelihood: 31.049
No. Observations: 96 AIC: -56.10
Df Residuals: 93 BIC: -48.41
Df Model: 2
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
const 0.0288 0.021 1.354 0.179 -0.013 0.071
Mkt-RF 1.7652 0.441 3.999 0.000 0.889 2.642
mkt_squared 7.2917 5.476 1.332 0.186 -3.582 18.165
==============================================================================
Omnibus: 30.680 Durbin-Watson: 1.567
Prob(Omnibus): 0.000 Jarque-Bera (JB): 62.921
Skew: 1.222 Prob(JB): 2.17e-14
Kurtosis: 6.123 Cond. No. 302.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
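To answer the question posed in the example, we can check the p-value on the quadratic term directly, just as we did for the dummy-variable tests above:

# Is the quadratic term significant at the 5% level?
print("Is mkt_squared significant?\n", res_squared.pvalues['mkt_squared'] < 0.05)

With a p-value of about 0.19 on mkt_squared, there is no significant quadratic relation between TSLA excess returns and market excess returns in this sample.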